TL;DR

The goal of this document is to look at residuals and see if we could use the residuals to look at some sort of adjusted “counts” (residuals can be negative, so we should probably find another name than “counts”). First, I fitted zinbwave without the batches in the X, then I included the batches in the X. For each scenario, I looked at naive, naive on the log scale, Pearson, and deviance residuals. I plotted the PCA of the residuals with two coloring (batch and clusters) and boxplots of the residuals for each cell.

Unnormalized data

I am still using the data Davide gave me a while ago. And I think now it is not the same data Davide is using in its most recent vignette (that is zinb_clustering_over_k.Rmd). It should not be very important but it would be nice if we have a common dataset we can work on.

load("../data/Expt4c2b_filtdata.Rda")
load("../data/E4c2b_slingshot_wsforkelly.RData")
de = read.csv('../data/oe_markers.txt', stringsAsFactors = F, header = F)
de = de$V1

Here we only look at all the genes.

names(batch) <- colnames(counts)

counts <- counts[,names(clus.labels)]
batch <- droplevels(batch[names(clus.labels)])
qc <- qc[names(clus.labels),]

#vars <- rowVars(log1p(counts))
#names(vars) <- rownames(counts)
#vars <- sort(vars, decreasing = TRUE)
#core <- counts[names(vars)[1:1000],]
core = counts

We have 616 cells. To speed up the computations, we will focus on the top 1,000 most variable genes.

dim(core)
## [1] 12781   616
core[1:3, 1:3]
##              OEP01_N706_S501 OEP01_N701_S501 OEP01_N707_S507
## CreER                      1            3022            2329
## Xkr4                       0               0               0
## LOC102640625               1               9               2

Cells have been processed in 18 different batches

table(batch)
## batch
## GBC08A GBC08B GBC09A GBC09B    P01    P02   P03A   P03B    P04    P05 
##     36     33     30     17     25     43     37     33     14     20 
##    P06    P10    P11    P12    P13    P14    Y01    Y04 
##     39     36     44     40     47     39     51     32

Cells have been clustered into 13 different clusters

table(clus.labels)
## clus.labels
##  1  2  3  4  5  7  8  9 10 11 12 14 15 
## 91 25 56 40 96 60 28 79 26 22 35 26 32

Batches are kind of confounded with the biology

table(data.frame(batch = as.vector(batch),
                 cluster = clus.labels))
##         cluster
## batch     1  2  3  4  5  7  8  9 10 11 12 14 15
##   GBC08A  0  2 12  9  0  0  0  0  0  2  0  2  9
##   GBC08B  0  7  5  3  0  0  0  1  2  4  0  5  6
##   GBC09A  0  1  5  9  0  0  0  1  1  0  0  6  7
##   GBC09B  0  2  2  7  0  0  0  3  0  0  0  3  0
##   P01     0  2  4  3 15  1  0  0  0  0  0  0  0
##   P02     2  0  9  3 15  3  3  2  3  0  2  1  0
##   P03A    3  0  3  0 12  2  9  4  2  0  2  0  0
##   P03B    1  2  1  1 11  1  2 10  1  1  2  0  0
##   P04     0  0  0  0 11  1  0  1  1  0  0  0  0
##   P05     0  0  0  1 11  3  0  1  0  2  2  0  0
##   P06     1  2  3  0  8  2  4  8  4  1  2  2  2
##   P10     3  1  4  0  4  5  9  2  0  2  5  0  1
##   P11     2  1  1  0  1  5  1 22  3  1  6  0  1
##   P12     0  2  0  0  4 10  0  8  2  3  6  4  1
##   P13     1  2  4  0  4 15  0  4  5  6  1  3  2
##   P14     0  0  1  2  0 12  0 12  2  0  7  0  3
##   Y01    47  1  1  2  0  0  0  0  0  0  0  0  0
##   Y04    31  0  1  0  0  0  0  0  0  0  0  0  0
plotBoxplot <- function(y, main = '', col, log = F){
  if (log) y = log2(y + 1)
  boxplot(y, main=main, col=col, staplewex=0, outline=0, 
          border=col, xaxt='n')
  abline(h=0)
}

compute.naive.residuals <- function(Y, zinb){
  mu_hat = t(getMu(zinb)) 
  pi_hat = t(getPi(zinb)) 
  Y_hat = (1 - pi_hat) * mu_hat
  Y - Y_hat 
}

compute.log.naive.residuals <- function(Y, zinb){
  mu = t(getMu(zinb)) 
  pi = t(getPi(zinb)) 
  phi = matrix(rep(getPhi(zinb), ncol(Y)), ncol = ncol(Y))
  var_hat = (1 - pi) * mu * (1 + mu * (phi + pi))
  Y_hat = (1 - pi) * mu
  log_Y_hat_plus_1 = log(1 + Y_hat) - var_hat / (2*(1 + Y_hat)^2)
  log(Y + 1) - log_Y_hat_plus_1
}


compute.pearson.residuals <- function(Y, zinb){
  num = compute.naive.residuals(Y, zinb)
  mu = t(getMu(zinb)) 
  pi = t(getPi(zinb)) 
  phi = matrix(rep(getPhi(zinb), ncol(Y)), ncol = ncol(Y))
  var_hat = (1 - pi) * mu * (1 + mu * (phi + pi))
  num / sqrt(var_hat)  
}

compute.zinb.loglik <- function(Y, zinb){
  mu = t(getMu(zinb)) 
  theta = getTheta(zinb)
  theta_mat = matrix(rep(theta, ncol(Y), ncol = ncol(Y)))
  pi = t(getPi(zinb))
  log( pi * (Y == 0) + (1 - pi) * dnbinom(Y, size = theta, mu = mu) )
}

compute.deviance.residuals <- function(Y, zinb){
  mu_hat = t(getMu(zinb)) 
  pi_hat = t(getPi(zinb)) 
  Y_hat = (1 - pi_hat) * mu_hat
  ll = compute.zinb.loglik(Y, zinb)
  sign = 1*(Y - Y_hat > 0)
  sign[sign == 0] = -1
  sign * sqrt(-2 * ll)
}

X intercept, no batch

Let’s run zinbwave with K = 0 and X with an intercept only (no batch).

print(system.time(zinb <- zinbFit(core, ncores = NCORES, K = 0)))
##     user   system  elapsed 
## 1827.539   38.656  532.069

Naive residuals

Rn = compute.naive.residuals(core, zinb)
Rn[1:3,1:3]
##              OEP01_N706_S501 OEP01_N701_S501 OEP01_N707_S507
## CreER           -1054.182261   2094.19514029     1668.865489
## Xkr4              -13.532615    -11.20593481       -7.164258
## LOC102640625       -8.996483      0.03197009       -4.641870

PCA (centered not scale) of residuals, first colored by batch (left), then by clusters (right). On the left side, we should not see patterns whereas on the right hand side we should.

pca = prcomp(t(Rn))
par(mfrow = c(1,2))
plot(pca$x, col = batch, pch = 19,
     main="PCA of residuals\ncolor=batch")
plot(pca$x, col = clus.labels, pch = 19,
     main = "PCA of residuals\ncolor=cluster")

par(mfrow = c(1,1))
de_1000 = de[de %in% rownames(Rn)]
length(de_1000)
## [1] 115
head(de_1000)
## [1] "Ascl1"   "Ccnd1"   "Kit"     "Lgr5"    "Neurod1" "Neurog1"

Each boxplot is for a cell. Colors correspond to the batches. When colors are the same but boxplots are not next to each others it corresponds to different batches. We would expect that the residuals look similar for the different batches.

Rn_order = Rn[, order(as.numeric(batch))]
col_order = as.numeric(batch)[order(as.numeric(batch))]
plotBoxplot(Rn_order, main = 'Boxplot of residuals\ncolor=batch',
            col = col_order)

breaks = setBreaks(Rn[de_1000, ], breaks = 0.99)
heatmap.2(Rn[de_1000, ], breaks = breaks,
          col = colorRampPalette(c('blue', 'yellow'))(51),
          scale = 'none', trace = 'none',
          ColSideColors = as.character(clus.labels),
          labCol = '', main = 'Naive Residuals')

Naive residuals of log(Y+1)

Rn = compute.log.naive.residuals(core, zinb)
Rn[1:3,1:3]
##              OEP01_N706_S501 OEP01_N701_S501 OEP01_N707_S507
## CreER             -3.8809820        3.626390       3.8244528
## Xkr4              10.6931681       11.616382      12.9172753
## LOC102640625      -0.1836331        1.497418       0.4742386
pca = prcomp(t(Rn))
par(mfrow = c(1,2))
plot(pca$x, col = batch, pch = 19,
     main="PCA of residuals\ncolor=batch")
plot(pca$x, col = clus.labels, pch = 19,
     main = "PCA of residuals\ncolor=cluster")

par(mfrow = c(1,1))
Rn_order = Rn[, order(as.numeric(batch))]
col_order = as.numeric(batch)[order(as.numeric(batch))]
plotBoxplot(Rn_order, main = 'Boxplot of residuals\ncolor=batch',
            col = col_order)

heatmap.2(Rn[de_1000, ],
          col = colorRampPalette(c('blue', 'yellow'))(51),
          scale = 'none', trace = 'none',
          ColSideColors = as.character(clus.labels),
          labCol = '', main = 'Log naive residuals')

Pearson residuals

Rp = compute.pearson.residuals(core, zinb)
Rp[1:3,1:3]
##              OEP01_N706_S501 OEP01_N701_S501 OEP01_N707_S507
## CreER             -0.4566869     1.019352273       1.1145278
## Xkr4              -0.1800795    -0.172770975      -0.1601206
## LOC102640625      -0.4691034     0.001855295      -0.3618107
pca = prcomp(t(Rp))
par(mfrow = c(1,2))
plot(pca$x, col = batch, pch = 19,
     main="PCA of residuals\ncolor=batch")
plot(pca$x, col = clus.labels, pch = 19,
     main = "PCA of residuals\ncolor=cluster")

par(mfrow = c(1,1))
Rp_order = Rp[, order(as.numeric(batch))]
col_order = as.numeric(batch)[order(as.numeric(batch))]
plotBoxplot(Rp_order, main = 'Boxplot of residuals\ncolor=batch',
            col = col_order)

breaks = setBreaks(Rp[de_1000, ], breaks = 0.99)
heatmap.2(Rp[de_1000, ],breaks = breaks,
          col = colorRampPalette(c('blue', 'yellow'))(51),
          scale = 'none', trace = 'none',
          ColSideColors = as.character(clus.labels),
          labCol = '', main = 'Pearson residuals')

Deviance residuals

Rd = compute.deviance.residuals(core, zinb)
Rd[1:3,1:3]
##              OEP01_N706_S501 OEP01_N701_S501 OEP01_N707_S507
## CreER             -2.7838750        4.525812       4.4807724
## Xkr4              -0.4961831       -0.473638      -0.4331959
## LOC102640625      -2.1494309        2.835279      -2.3243557
pca = prcomp(t(Rd))
par(mfrow = c(1,2))
plot(pca$x, col = batch, pch = 19,
     main="PCA of residuals\ncolor=batch")
plot(pca$x, col = clus.labels, pch = 19,
     main = "PCA of residuals\ncolor=cluster")

par(mfrow = c(1,1))
Rd_order = Rd[, order(as.numeric(batch))]
col_order = as.numeric(batch)[order(as.numeric(batch))]
plotBoxplot(Rd_order, main = 'Boxplot of residuals\ncolor=batch',
            col = col_order)

heatmap.2(Rd[de_1000, ],
          col = colorRampPalette(c('blue', 'yellow'))(51),
          scale = 'none', trace = 'none',
          ColSideColors = as.character(clus.labels),
          labCol = '', main = 'Deviance residuals')

X batches

Let’s run zinbwave with K = 0 and X with an intercept only (no batch).

mod = model.matrix( ~ batch)
print(system.time(zinb_batch <- zinbFit(core, ncores = NCORES, K = 0,
                                  X = mod)))
##     user   system  elapsed 
## 3300.844   55.424  715.158

Naive residuals

Rn = compute.naive.residuals(core, zinb_batch)
Rn[1:3,1:3]
##              OEP01_N706_S501 OEP01_N701_S501 OEP01_N707_S507
## CreER           -2156.802509     1023.810053      891.511804
## Xkr4               -5.206859       -4.448180       -2.818703
## LOC102640625       -3.536348        4.783501       -1.053260
pca = prcomp(t(Rn))
par(mfrow = c(1,2))
plot(pca$x, col = batch, pch = 19,
     main="PCA of residuals\ncolor=batch")
plot(pca$x, col = clus.labels, pch = 19,
     main = "PCA of residuals\ncolor=cluster")

par(mfrow = c(1,1))
Rn_order = Rn[, order(as.numeric(batch))]
col_order = as.numeric(batch)[order(as.numeric(batch))]
plotBoxplot(Rn_order, main = 'Boxplot of residuals\ncolor=batch',
            col = col_order)

breaks = setBreaks(Rn[de_1000, ], breaks = 0.99)
heatmap.2(Rn[de_1000, ], breaks = breaks,
          col = colorRampPalette(c('blue', 'yellow'))(51),
          scale = 'none', trace = 'none',
          ColSideColors = as.character(clus.labels),
          labCol = '', main = 'Naive Residuals')

Naive residuals of log(Y+1)

Rn = compute.log.naive.residuals(core, zinb)
Rn[1:3,1:3]
##              OEP01_N706_S501 OEP01_N701_S501 OEP01_N707_S507
## CreER             -3.8809820        3.626390       3.8244528
## Xkr4              10.6931681       11.616382      12.9172753
## LOC102640625      -0.1836331        1.497418       0.4742386
pca = prcomp(t(Rn))
par(mfrow = c(1,2))
plot(pca$x, col = batch, pch = 19,
     main="PCA of residuals\ncolor=batch")
plot(pca$x, col = clus.labels, pch = 19,
     main = "PCA of residuals\ncolor=cluster")

par(mfrow = c(1,1))
Rn_order = Rn[, order(as.numeric(batch))]
col_order = as.numeric(batch)[order(as.numeric(batch))]
plotBoxplot(Rn_order, main = 'Boxplot of residuals\ncolor=batch',
            col = col_order)

heatmap.2(Rn[de_1000, ],
          col = colorRampPalette(c('blue', 'yellow'))(51),
          scale = 'none', trace = 'none',
          ColSideColors = as.character(clus.labels),
          labCol = '', main = 'Log naive residuals')

Pearson residuals

Rp = compute.pearson.residuals(core, zinb_batch)
Rp[1:3,1:3]
##              OEP01_N706_S501 OEP01_N701_S501 OEP01_N707_S507
## CreER             -0.5703063       0.2916186       0.3514468
## Xkr4              -0.2181257      -0.2086423      -0.1942887
## LOC102640625      -0.4392369       0.6375194      -0.1911441
pca = prcomp(t(Rp))
par(mfrow = c(1,2))
plot(pca$x, col = batch, pch = 19,
     main="PCA of residuals\ncolor=batch")
plot(pca$x, col = clus.labels, pch = 19,
     main = "PCA of residuals\ncolor=cluster")

par(mfrow = c(1,1))
Rp_order = Rp[, order(as.numeric(batch))]
col_order = as.numeric(batch)[order(as.numeric(batch))]
plotBoxplot(Rp_order, main = 'Boxplot of residuals\ncolor=batch',
            col = col_order)

breaks = setBreaks(Rp[de_1000, ], breaks = 0.99)
heatmap.2(Rp[de_1000, ],breaks = breaks,
          col = colorRampPalette(c('blue', 'yellow'))(51),
          scale = 'none', trace = 'none',
          ColSideColors = as.character(clus.labels),
          labCol = '', main = 'Pearson residuals')

Deviance residuals

Rd = compute.deviance.residuals(core, zinb_batch)
Rd[1:3,1:3]
##              OEP01_N706_S501 OEP01_N701_S501 OEP01_N707_S507
## CreER             -2.8670993       4.4156535       4.3602138
## Xkr4              -0.5505718      -0.5241628      -0.4794994
## LOC102640625      -2.0274327       2.8304297      -2.2227802
pca = prcomp(t(Rd))
par(mfrow = c(1,2))
plot(pca$x, col = batch, pch = 19,
     main="PCA of residuals\ncolor=batch")
plot(pca$x, col = clus.labels, pch = 19,
     main = "PCA of residuals\ncolor=cluster")

par(mfrow = c(1,1))
Rd_order = Rd[, order(as.numeric(batch))]
col_order = as.numeric(batch)[order(as.numeric(batch))]
plotBoxplot(Rd_order, main = 'Boxplot of residuals\ncolor=batch',
            col = col_order)

heatmap.2(Rd[de_1000, ],
          col = colorRampPalette(c('blue', 'yellow'))(51),
          scale = 'none', trace = 'none',
          ColSideColors = as.character(clus.labels),
          labCol = '', main = 'Deviance residuals')

Conclusions

As batches are somehow confounded with the biology, I don’t know if we should adjust or not for the batches. For the residuals, it seems that the Pearson residuals when not adjusting for batches is the most informative (at least, when looking at PC 1 and 2).

Session Info

sessionInfo()
## R version 3.3.3 (2017-03-06)
## Platform: x86_64-redhat-linux-gnu (64-bit)
## Running under: CentOS release 6.9 (Final)
## 
## locale:
##  [1] LC_CTYPE=ja_JP.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=ja_JP.UTF-8        LC_COLLATE=ja_JP.UTF-8    
##  [5] LC_MONETARY=ja_JP.UTF-8    LC_MESSAGES=ja_JP.UTF-8   
##  [7] LC_PAPER=ja_JP.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=ja_JP.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] parallel  stats4    methods   stats     graphics  grDevices utils    
## [8] datasets  base     
## 
## other attached packages:
##  [1] clusterExperiment_1.2.0    gplots_3.0.1              
##  [3] matrixStats_0.52.2         zinbwave_0.99.1           
##  [5] SummarizedExperiment_1.4.0 Biobase_2.34.0            
##  [7] GenomicRanges_1.26.4       GenomeInfoDb_1.10.3       
##  [9] IRanges_2.8.2              S4Vectors_0.12.2          
## [11] BiocGenerics_0.20.0        knitr_1.16                
## 
## loaded via a namespace (and not attached):
##   [1] nlme_3.1-131       bitops_1.0-6       bold_0.4.0        
##   [4] progress_1.1.2     doParallel_1.0.10  RColorBrewer_1.1-2
##   [7] httr_1.2.1         rprojroot_1.2      prabclus_2.2-6    
##  [10] numDeriv_2016.8-1  tools_3.3.3        backports_1.1.0   
##  [13] R6_2.2.1           KernSmooth_2.23-15 DBI_0.6-1         
##  [16] lazyeval_0.2.0     colorspace_1.3-2   ade4_1.7-6        
##  [19] trimcluster_0.1-2  nnet_7.3-12        prettyunits_1.0.2 
##  [22] gridExtra_2.2.1    glmnet_2.0-10      pspline_1.0-17    
##  [25] xml2_1.1.1         pkgmaker_0.22      diptest_0.75-7    
##  [28] caTools_1.17.1     scales_0.4.1       DEoptimR_1.0-8    
##  [31] mvtnorm_1.0-6      robustbase_0.92-7  NMF_0.20.6        
##  [34] stringr_1.2.0      digest_0.6.12      rmarkdown_1.5     
##  [37] XVector_0.14.1     htmltools_0.3.6    stabledist_0.7-1  
##  [40] ADGofTest_0.3      limma_3.30.13      rlang_0.1.1       
##  [43] howmany_0.3-1      jsonlite_1.4       mclust_5.3        
##  [46] gtools_3.5.0       dplyr_0.5.0        dendextend_1.5.2  
##  [49] RCurl_1.95-4.8     magrittr_1.5       modeltools_0.2-21 
##  [52] Matrix_1.2-10      Rcpp_0.12.11       munsell_0.4.3     
##  [55] ape_4.1            abind_1.4-5        viridis_0.4.0     
##  [58] stringi_1.1.5      whisker_0.3-2      yaml_2.1.14       
##  [61] MASS_7.3-47        zlibbioc_1.20.0    flexmix_2.3-14    
##  [64] MAST_1.0.5         plyr_1.8.4         grid_3.3.3        
##  [67] gdata_2.17.0       rncl_0.8.2         lattice_0.20-35   
##  [70] splines_3.3.3      uuid_0.1-2         taxize_0.8.4      
##  [73] fpc_2.1-10         rngtools_1.2.4     softImpute_1.4    
##  [76] reshape2_1.4.2     codetools_0.2-15   XML_3.98-1.7      
##  [79] evaluate_0.10      RNeXML_2.0.7       data.table_1.10.4 
##  [82] foreach_1.4.3      locfdr_1.1-8       tidyr_0.6.3       
##  [85] gtable_0.2.0       reshape_0.8.6      assertthat_0.2.0  
##  [88] kernlab_0.9-25     ggplot2_2.2.1      gridBase_0.4-7    
##  [91] phylobase_0.8.4    xtable_1.8-2       class_7.3-14      
##  [94] pcaPP_1.9-61       viridisLite_0.2.0  gsl_1.9-10.3      
##  [97] tibble_1.3.1       copula_0.999-16    iterators_1.0.8   
## [100] registry_0.3       cluster_2.0.6